Lesson 4


Scatterplots and Perceived Audience Size

Notes:


Scatterplots

Notes:

# Switch the session's working directory and echo it back; the relative path
# given to read.csv() further down is resolved against this directory.
# NOTE(review): hard-coded absolute path - this will only run on a machine
# that has this exact directory; consider a project-relative path instead.
setwd("/Users/brianchase/R")
getwd()
## [1] "/Users/brianchase/R"
# Attach ggplot2 for the plots below, then read the tab-separated data into pf.
library(ggplot2)
pf <- read.csv(file = "pseudo_facebook.tsv", sep = "\t")

# Quick scatterplot of friend_count against age, with x and y named explicitly.
qplot(x = age, y = friend_count, data = pf)

# The same plot with x and y passed positionally.
qplot(age, friend_count, data = pf)


What are some things that you notice right away?

Response:


ggplot Syntax

Notes:

# The same scatterplot written with the layered ggplot() syntax.
# xlim() restricts the x axis to ages 13-90; rows outside it are dropped
# (hence the "Removed ... rows" warning).
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).


Overplotting

Notes:

# Reduce overplotting with transparency: at alpha = 1/20 it takes 20
# overlapping points to produce one fully opaque mark.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1/20) +
  xlim(13, 90)
## Warning: Removed 4906 rows containing missing values (geom_point).

# Swap geom_point() for geom_jitter() so that the integer-valued ages no
# longer stack in perfect vertical columns; transparency is kept at 1/20.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1/20) +
  xlim(13, 90)
## Warning: Removed 5184 rows containing missing values (geom_point).

What do you notice in the plot?

Response:


Coord_trans()

Notes:

# Scatterplot of friend_count vs. age drawn on a square-root y axis.
# Jitter is horizontal only: height is pinned to 0 because vertical jitter
# could push values below zero, where the sqrt transformation is undefined.
# The argument is spelled out as `height` rather than relying on partial
# matching of `h`.
ggplot(aes(x = age, y = friend_count), data = pf) +
  geom_point(alpha = 1/20, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 5184 rows containing missing values (geom_point).

Look up the documentation for coord_trans() and add a layer to the plot that transforms friend_count using the square root function. Create your plot!

What do you notice?


Alpha and Jitter

Notes:

# friendships_initiated vs. age on a square-root y axis, with transparency
# (1/30) and horizontal-only jitter. `height = 0` is written in full instead
# of the partially matched `h = 0`; vertical jitter is disabled so no value
# can be pushed below zero before the sqrt transformation.
ggplot(aes(x = age, y = friendships_initiated), data = pf) +
  geom_point(alpha = 1/30, position = position_jitter(height = 0)) +
  coord_trans(y = "sqrt")


Overplotting and Domain Knowledge

Notes:


Conditional Means

Notes:

#install.packages('dplyr')
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Conditional summaries, step by step: group users by age, then compute the
# mean and median friend_count and the group size for each age, and finally
# order the result by age.
age_groups <- group_by(pf, age)
pf.fc_by_age <- summarise(
  age_groups,
  friend_count_mean = mean(friend_count),
  friend_count_median = median(friend_count),
  n = n()
)
pf.fc_by_age <- arrange(pf.fc_by_age, age)

# Show the first rows of the summary table.
head(pf.fc_by_age)
## Source: local data frame [6 x 4]
## 
##     age friend_count_mean friend_count_median     n
##   (int)             (dbl)               (dbl) (int)
## 1    13          164.7500                74.0   484
## 2    14          251.3901               132.0  1925
## 3    15          347.6921               161.0  2618
## 4    16          351.9371               171.5  3086
## 5    17          350.3006               156.0  3283
## 6    18          331.1663               162.0  5196

Conditional Means Alternate Code

#install.packages('dplyr')
library(dplyr)

# The same per-age summary written as a single pipeline: each %>% feeds the
# result of the previous step into the next verb.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)

# First 20 rows of the summary table.
head(pf.fc_by_age, n = 20)
## Source: local data frame [20 x 4]
## 
##      age friend_count_mean friend_count_median     n
##    (int)             (dbl)               (dbl) (int)
## 1     13          164.7500                74.0   484
## 2     14          251.3901               132.0  1925
## 3     15          347.6921               161.0  2618
## 4     16          351.9371               171.5  3086
## 5     17          350.3006               156.0  3283
## 6     18          331.1663               162.0  5196
## 7     19          333.6921               157.0  4391
## 8     20          283.4991               135.0  3769
## 9     21          235.9412               121.0  3671
## 10    22          211.3948               106.0  3032
## 11    23          202.8426                93.0  4404
## 12    24          185.7121                92.0  2827
## 13    25          131.0211                62.0  3641
## 14    26          144.0082                75.0  2815
## 15    27          134.1473                72.0  2240
## 16    28          125.8354                66.0  2364
## 17    29          120.8182                66.0  1936
## 18    30          115.2080                67.5  1716
## 19    31          118.4599                63.0  1694
## 20    32          114.2800                63.0  1443

Create your plot!

# Median friend_count per age as a line, with an x-axis tick every two years
# from 13 to 113 and a square-root y axis.
ggplot(data = pf.fc_by_age,
       mapping = aes(x = age, y = friend_count_median)) +
  geom_line() +
  scale_x_continuous(limits = c(13, 113),
                     breaks = seq(from = 13, to = 113, by = 2)) +
  coord_trans(y = "sqrt")

# Five-number summaries of every column in the per-age table.
summary(pf.fc_by_age)
##       age      friend_count_mean friend_count_median       n         
##  Min.   : 13   Min.   : 84.02    Min.   : 45.0       Min.   :   9.0  
##  1st Qu.: 38   1st Qu.:108.15    1st Qu.: 54.5       1st Qu.: 136.0  
##  Median : 63   Median :134.15    Median : 66.0       Median : 801.0  
##  Mean   : 63   Mean   :212.46    Mean   :106.8       Mean   : 980.2  
##  3rd Qu.: 88   3rd Qu.:333.85    3rd Qu.:162.0       3rd Qu.:1099.0  
##  Max.   :113   Max.   :484.94    Max.   :275.0       Max.   :5196.0

Overlaying Summaries with Raw Data

Notes:

# Overlay per-age summaries on the raw scatter (ages 13-90, sqrt y axis):
#   default-colour solid line - mean friend_count
#   blue dashed lines         - 10th and 90th percentiles
#   blue solid line           - median (50th percentile)
# Jitter is horizontal only; `height = 0` is spelled out instead of the
# partially matched `h = 0`.
# NOTE(review): `fun.y` was renamed to `fun` in ggplot2 3.3.0. It is kept here
# because the recorded output appears to come from an older ggplot2 - confirm
# the installed version before switching.
ggplot(aes(x = age, y = friend_count), data = pf) +
  xlim(13, 90) +
  geom_point(alpha = 0.05,
             position = position_jitter(height = 0),
             color = 'orange') +
  coord_trans(y = 'sqrt') +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .1),
            linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .5),
            color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .9),
            linetype = 2, color = 'blue')
## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).

## Warning: Removed 4906 rows containing non-finite values (stat_summary).
## Warning: Removed 5172 rows containing missing values (geom_point).

Zoom into data with coord_cartesian function

# Zoomed view of the overlay plot: coord_cartesian() limits the visible window
# to ages 13-70 and friend counts 0-1000. Unlike xlim()/ylim(), it does not
# drop rows, so the summary lines are computed from the full data.
# Jitter is horizontal only; `height = 0` is spelled out instead of the
# partially matched `h = 0`.
# NOTE(review): `fun.y` was renamed to `fun` in ggplot2 3.3.0 - confirm the
# installed version before switching.
ggplot(aes(x = age, y = friend_count), data = pf) +
  coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000)) +
  geom_point(alpha = 0.05,
             position = position_jitter(height = 0),
             color = 'orange') +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .1),
            linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .5),
            color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .9),
            linetype = 2, color = 'blue')

What are some of your observations of the plot?

Response:


Moira: Histogram Summary and Scatterplot

See the Instructor Notes of this video to download Moira’s paper on perceived audience size and to see the final plot.

Notes:


Correlation

Notes:

# Pearson correlation test between age and friend_count, addressing the
# columns directly through pf$.
cor.test(pf$age, pf$friend_count, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737
# Same test, using with() so the bare column names are looked up inside pf.
with(pf, cor.test(age, friend_count, method = 'pearson'))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737

Look up the documentation for the cor.test function.

What’s the correlation between age and friend count? Round to three decimal places. Response:


Correlation on Subsets

Notes:

# Repeat the correlation test on users aged 70 or younger only. No method is
# given, so cor.test() uses its default (Pearson, per the output header).
with(subset(pf, age <= 70), cor.test(age, friend_count))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1780220 -0.1654129
## sample estimates:
##        cor 
## -0.1717245

Correlation methods available in cor.test(): Pearson, Kendall, and Spearman.


Correlation Methods

Notes:


Create Scatterplots

Notes:

# Scatterplot of likes_received against www_likes_received with a red linear
# fit, with both axes capped at the 95th percentile of their variable.
# The column names inside quantile() must be spelled exactly as in pf: a
# misspelled `pf$...` evaluates to NULL, quantile(NULL, 0.95) is NA, and the
# upper limit then becomes NA instead of the 95th percentile.
ggplot(aes(x = www_likes_received, y = likes_received), data = pf) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, 0.95)) +
  ylim(0, quantile(pf$likes_received, 0.95)) +
  geom_smooth(method = 'lm', color = 'red')


Strong Correlations

Notes:

# likes_received vs. www_likes_received, limited on both axes to the range
# from 0 up to each variable's 95th percentile, with a red least-squares line.
ggplot(data = pf,
       mapping = aes(x = www_likes_received, y = likes_received)) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, probs = 0.95)) +
  ylim(0, quantile(pf$likes_received, probs = 0.95)) +
  geom_smooth(method = "lm", color = "red")
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).

What’s the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.

# Pearson correlation between the two like counts, computed on all rows of pf
# (no percentile trimming here, unlike the plot).
with(pf, cor.test(www_likes_received, likes_received, method = 'pearson'))
## 
##  Pearson's product-moment correlation
## 
## data:  www_likes_received and likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473553 0.9486176
## sample estimates:
##       cor 
## 0.9479902

Response:


Moira on Correlation

Notes:


More Caution with Correlation

Notes:

#install.packages('alr3')
library(alr3)
## Loading required package: car
## Warning: package 'car' was built under R version 3.2.4
# Load the Mitchell data set (presumably provided by alr3, attached above)
# and open its help page.
data(Mitchell)
?Mitchell

# Raw scatterplot of Temp against Month.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point()

Create your plot!


Noisy Scatterplots

  1. Take a guess for the correlation coefficient for the scatterplot.

  2. What is the actual correlation of the two variables? (Round to the thousandths place)

# Pearson correlation test between Month and Temp in the Mitchell data.
with(Mitchell, cor.test(Month, Temp, method = 'pearson'))
## 
##  Pearson's product-moment correlation
## 
## data:  Month and Temp
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08053637  0.19331562
## sample estimates:
##        cor 
## 0.05747063

Making Sense of Data

Notes:

# Inspect the range of Month before choosing axis breaks.
summary(Mitchell$Month)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   50.75  101.50  101.50  152.20  203.00
# Month is numeric (0 to 203 according to its summary), so it needs a
# continuous x scale rather than a discrete one. A break every 12 months
# makes the yearly cycle visible.
ggplot(data = Mitchell, aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, 12))


# Fold the month index onto a 12-month cycle (Month modulo 12) so that all
# years are drawn on top of one another.
ggplot(data = Mitchell, mapping = aes(x = (Month %% 12), y = Temp)) +
  geom_point()

A New Perspective

What do you notice? Response:

Watch the solution video and check out the Instructor Notes! Notes:


Understanding Noise: Age to Age Months

Notes:

# Mean friend_count per age, drawn as a line over the per-age summary table.
ggplot(data = pf.fc_by_age,
       mapping = aes(x = age, y = friend_count_mean)) +
  geom_line()

# First 10 rows of the per-age summary.
head(pf.fc_by_age, n = 10)
## Source: local data frame [10 x 4]
## 
##      age friend_count_mean friend_count_median     n
##    (int)             (dbl)               (dbl) (int)
## 1     13          164.7500                74.0   484
## 2     14          251.3901               132.0  1925
## 3     15          347.6921               161.0  2618
## 4     16          351.9371               171.5  3086
## 5     17          350.3006               156.0  3283
## 6     18          331.1663               162.0  5196
## 7     19          333.6921               157.0  4391
## 8     20          283.4991               135.0  3769
## 9     21          235.9412               121.0  3671
## 10    22          211.3948               106.0  3032
# Rows 17-19 of the summary, i.e. ages 29-31 in the printed output.
pf.fc_by_age[17:19, ]
## Source: local data frame [3 x 4]
## 
##     age friend_count_mean friend_count_median     n
##   (int)             (dbl)               (dbl) (int)
## 1    29          120.8182                66.0  1936
## 2    30          115.2080                67.5  1716
## 3    31          118.4599                63.0  1694

Create an age_with_months variable

# Fractional age: whole years plus the share of a year implied by birth month
# (dob_month = 12 adds 0, dob_month = 1 adds 11/12).
pf$age_with_months <- pf$age + (1 - pf$dob_month / 12)

Age with Months Means

Programming Assignment

# Per-month-of-age summary: mean and median friend_count and the group size
# for every distinct age_with_months value, ordered by age_with_months.
pf.fc_by_age_months <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)

# First 10 rows of the monthly summary.
head(pf.fc_by_age_months, n = 10)
## Source: local data frame [10 x 4]
## 
##    age_with_months friend_count_mean friend_count_median     n
##              (dbl)             (dbl)               (dbl) (int)
## 1         13.16667          46.33333                30.5     6
## 2         13.25000         115.07143                23.5    14
## 3         13.33333         136.20000                44.0    25
## 4         13.41667         164.24242                72.0    33
## 5         13.50000         131.17778                66.0    45
## 6         13.58333         156.81481                64.0    54
## 7         13.66667         130.06522                75.5    46
## 8         13.75000         205.82609               122.0    69
## 9         13.83333         215.67742               111.0    62
## 10        13.91667         162.28462                71.0   130

Noise in Conditional Means

# Mean friend_count by age measured in months, restricted to
# age_with_months below 71.
ggplot(data = subset(pf.fc_by_age_months, age_with_months < 71),
       mapping = aes(x = age_with_months, y = friend_count_mean)) +
  geom_line()


Smoothing Conditional Means

Notes:

# Three views of mean friend_count for users under 71, at different bin widths.

# p1: one value per year of age, with a smoother drawn on top.
p1 <- ggplot(data = subset(pf.fc_by_age, age < 71),
             mapping = aes(x = age, y = friend_count_mean)) +
  geom_line() +
  geom_smooth()

# p2: one value per month of age, with a smoother drawn on top.
p2 <- ggplot(data = subset(pf.fc_by_age_months, age_with_months < 71),
             mapping = aes(x = age_with_months, y = friend_count_mean)) +
  geom_line() +
  geom_smooth()

# p3: ages rounded to multiples of 5; the mean is computed from the raw rows
# by the summary stat.
p3 <- ggplot(data = subset(pf, age < 71),
             mapping = aes(x = round(age / 5) * 5, y = friend_count)) +
  geom_line(stat = "summary", fun.y = mean)

# Stack the plots in one column: monthly on top, then yearly, then 5-year bins.
library(gridExtra)
grid.arrange(p2, p1, p3, ncol = 1)


Which Plot to Choose?

Notes:


Analyzing Two Variables

Reflection:


Click KnitHTML to see all of your hard work and to have an html page of this lesson, your answers, and your notes!